import pandas as pd
import numpy as np
from matplotlib.ticker import FuncFormatter
import matplotlib.pyplot as plt
import matplotlib.lines as lines
import seaborn as sns
import datetime
import folium
import json
from branca.colormap import linear
%matplotlib notebook
The goal of this data analysis research is to better understand the problem of sexism in Brazil. It is well known that women face many more obstacles than men in several areas of our society, but seeing those differences in charts makes it easier to convey how large the gap is.
A quick Google search turns up several articles about the disparity between women's and men's salaries, top positions at work, percentage of doctorates and much more. Talking specifically about Brazil, we as a country suffer from a lack of open data and of the infrastructure needed to understand the problems of our society. As a consequence, finding data about the differences between women and men, such as opportunities and investments, is a challenge.
Since we live in a capitalist society, income is a major source of opportunity regardless of the area we are talking about. In other words, analysing the wage differences between women and men will point us towards a better analytical approach for figuring out why men have many more opportunities than women.
In Brazil, we have several types of companies. Those types are defined by the amount of income they receive per year. The data below is released every quarter by the federal government of Brazil to show the amount of credit it has granted to small entrepreneurs.
The data can be found at the following links:
# Load the women's credit series (semicolon-separated, latin1-encoded CSV).
fem_csv_path = 'Data/saldo_credito_mei_feminino.csv'
df_mei_fem = pd.read_csv(fem_csv_path, sep=';', encoding='latin1')

# 'valor' is read as text with a decimal comma; swap it for a dot so the
# column can be cast to float.
df_mei_fem['valor'] = (
    df_mei_fem['valor']
    .map(lambda raw: raw.replace(',', '.'))
    .astype('float')
)

# Parse the day/month/year dates once, keep a quarterly period next to each
# row, and use the parsed date as the index.
fem_dates = pd.to_datetime(df_mei_fem['data'], format="%d/%m/%Y")
df_mei_fem['data'] = fem_dates
df_mei_fem['data_quarter'] = fem_dates.dt.to_period("Q")
df_mei_fem = df_mei_fem.set_index('data')
# Load the men's credit series (semicolon-separated, latin1-encoded CSV).
mas_csv_path = 'Data/saldo_credito_mei_masculino.csv'
df_mei_mas = pd.read_csv(mas_csv_path, sep=';', encoding='latin1')

# 'valor' is read as text with a decimal comma; swap it for a dot so the
# column can be cast to float.
df_mei_mas['valor'] = (
    df_mei_mas['valor']
    .map(lambda raw: raw.replace(',', '.'))
    .astype('float')
)

# Parse the day/month/year dates once, keep a quarterly period next to each
# row, and use the parsed date as the index.
mas_dates = pd.to_datetime(df_mei_mas['data'], format="%d/%m/%Y")
df_mei_mas['data'] = mas_dates
df_mei_mas['data_quarter'] = mas_dates.dt.to_period("Q")
df_mei_mas = df_mei_mas.set_index('data')
# Build a side-by-side table of both series. It starts from the women's frame
# minus its 'valor' column (drop returns a new frame, so df_mei_fem itself is
# left untouched); the men's values are aligned on the date index on assignment.
df_join = df_mei_fem.drop(columns=['valor'])
df_join['valor_fem'] = df_mei_fem['valor']
df_join['valor_mas'] = df_mei_mas['valor']

# Row-wise envelope of the two series, used later to shade the gap between them.
credit_pairs = df_join[['valor_fem', 'valor_mas']].to_numpy()
df_join['valor_max'] = credit_pairs.max(axis=1)
df_join['valor_min'] = credit_pairs.min(axis=1)
# Line chart of the women's and men's credit series, with the gap between
# them shaded.
fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(df_mei_fem.index, 'valor', data=df_mei_fem,
        markerfacecolor='blue', markersize=12, color='#394989',
        label='Women')
ax.plot(df_mei_mas.index, 'valor', data=df_mei_mas,
        markerfacecolor='red', markersize=12, color='#cf1b1b',
        label='Men')

# Shade the area between the lower and the higher of the two series.
ax.fill_between(df_mei_mas.index.tolist(), df_join['valor_min'], df_join['valor_max'],
                facecolor='#000000', alpha=0.1)

# Keep only the left and bottom axes.
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.yaxis.set_ticks_position('left')
ax.xaxis.set_ticks_position('bottom')

# Pin one tick to every observation date BEFORE assigning the labels.
# set_xticklabels only attaches text to the ticks that already exist, so
# without fixed positions the quarter labels would be handed out in order to
# whatever ticks the auto-locator picked and would not match the data.
ax.set_xticks(df_mei_fem.index)
ax.set_xticklabels(df_mei_fem['data_quarter'].astype(str))

ttl = ax.set_title('Difference between women\'s and men\'s small\n companies credit received by the federal government')
ttl.set_position([0.5, 0.95])

plt.setp(ax.get_xticklabels(), rotation=30, ha="right")
ax.set_xlabel('Quarter', fontweight='bold', labelpad=20)
ax.set_ylabel('Millions of R$ - reais', fontweight='bold', labelpad=20)
ax.legend()

# Leave room at the bottom for the rotated tick labels and the axis label.
plt.subplots_adjust(bottom=0.3)
plt.show()
The chart above shows the huge difference between the credit granted to each gender. The data represents how much more incentive male entrepreneurs receive compared with female entrepreneurs. Thus, it is expected that men generate even more income, since they are receiving more credit. This chart alone isn't enough to prove the existence of gender disparity in Brazil, but it gives a clear view of it.
Understanding the disparity in grades between young women and men may be a good indicator of why we see many more opportunities for the latter than for the former. To this end, we use a national test - ENEM - which is similar to the SAT in the United States.
# Load the ENEM 2019 microdata file (semicolon-separated, latin1-encoded).
enem_csv_path = 'Data/microdados_enem_2019/DADOS/MICRODADOS_ENEM_2019.csv'
dados_enem = pd.read_csv(enem_csv_path, sep=';', encoding='latin1')
# Preview the first rows.
dados_enem.head()
The test - ENEM - is divided into 5 areas:
The idea is to calculate the average grade by gender and state. Thus, the charts obtained will give us a narrower view of which states have more disparity and which do not.
It is important to highlight that the grades in the ENEM are within a range of 0 to 1000.
# Average grade per state of residence and sex, for each of the five grade columns.
grade_columns = ['NU_NOTA_CN', 'NU_NOTA_CH', 'NU_NOTA_LC', 'NU_NOTA_MT', 'NU_NOTA_REDACAO']

# Select the grade columns BEFORE aggregating: calling .mean() on the whole
# frame averages every column (wasteful on a file this wide) and raises a
# TypeError on pandas 2.x as soon as a non-numeric column is present.
dados_enem_g = (
    dados_enem
    .groupby(['SG_UF_RESIDENCIA', 'TP_SEXO'])[grade_columns]
    .mean()
    .reset_index()
    .rename(columns={"SG_UF_RESIDENCIA": "estado"})
    .set_index('estado')  # index by state; TP_SEXO stays as a regular column
)
dados_enem_g.head()
# GeoJSON used to draw the state map. The `with` block closes the file handle
# as soon as parsing is done instead of leaving it open until garbage collection.
with open('Data/EstadosBrasileiros/br_states.json') as geo_json_file:
    geo_json_data = json.load(geo_json_file)
def catergoryMap(df, title):
    """Build a folium map whose GeoJSON features are coloured by the values in ``df``.

    Args:
        df: Values to plot. It must support ``min()``/``max()`` and be
            indexable by the ``id`` of each feature in ``geo_json_data``
            (in this notebook it is called with a Series indexed by state).
        title: Caption shown on the colour scale.

    Returns:
        The ``folium.Map`` with the coloured layer, the colour scale and a
        layer control added.
    """
    # Yellow-to-red scale stretched over the observed range of values.
    lowest, highest = df.min(), df.max()
    colormap = linear.YlOrRd_09.scale(lowest, highest)
    colormap.caption = title

    def style_for(feature):
        # Fill each feature with the colour of its own value; thin black outline.
        return {
            'fillColor': colormap(df[feature['id']]),
            'color': 'black',
            'weight': 0.3,
        }

    mapa = folium.Map(
        location=[-15.77972, -47.92972],
        zoom_start=3.5,
        width=600, height=400,
    )

    state_layer = folium.GeoJson(
        geo_json_data,
        name='2019',
        style_function=style_for,
    )
    state_layer.add_to(mapa)

    colormap.add_to(mapa)
    folium.LayerControl(collapsed=False).add_to(mapa)
    return mapa
## Women
# Rows where TP_SEXO is 'F', plus a 'mean_grade' column holding the row-wise
# average of the grade columns from NU_NOTA_CN through NU_NOTA_REDACAO.
women_rows = dados_enem_g.loc[dados_enem_g['TP_SEXO'] == 'F']
col = women_rows.loc[:, "NU_NOTA_CN":"NU_NOTA_REDACAO"]
df_w_mean_grade = women_rows.assign(mean_grade=col.mean(axis=1))

## Men
# Same computation for the rows where TP_SEXO is 'M'.
men_rows = dados_enem_g.loc[dados_enem_g['TP_SEXO'] == 'M']
col = men_rows.loc[:, "NU_NOTA_CN":"NU_NOTA_REDACAO"]
df_m_mean_grade = men_rows.assign(mean_grade=col.mean(axis=1))

# Map of the women's mean grade per state.
catergoryMap(df_w_mean_grade['mean_grade'], 'Women mean grade per state')